
library(MASS)
library(ggplot2)
library(stringr)
library(tidyr)
library(plyr)
library(gridExtra)
library(dplyr)
library(stargazer)
library(margins)
library(reshape2)
library(zoo)
library(interactions)
library(lme4)



################
#### Demographics
##############


remove(list=ls())

setwd("~/Dropbox/Selection Experiments Harvard/Session Data/")

##Get CCES data

cces <- readRDS("cces_2006_2018.rds")

##Subset just the 2018 CCES data
cces_2018 <- cces %>% filter(cces$year == 2018)

##Get Sophie sessions data
sessions <- read.csv("sessionscombined.csv")



se <- function(vector){
  return(sd(vector, na.rm = TRUE) / sqrt(length(vector[!is.na(vector)])))
}

se_proportion <- function(p, n) {
  answer <- sqrt((p*(100-p))/n)
  return(answer)
}


#	Creating a variable that = 1 if the participant actually participated in the study (instead of being excused)
sessions$actuallyparticipated <- ifelse(sessions$boughtTickets > 0 & sessions$Stepgroup.Label == "LotteryExperiment", 1, 0)
sessions <- sessions %>% group_by(sessionno,Participant) %>% mutate(actuallyparticipated=max(actuallyparticipated, na.rm = TRUE))
sessions <- as.data.frame(sessions)

#	Splitting out the demographics and just subsetting those who played the games
sessions_demog <- sessions[ which(sessions$Stepgroup.Label == "Demographics"), ]
sessions_demog <- sessions_demog %>% filter(sessions_demog$actuallyparticipated == 1)

head(sessions_demog)
names(sessions_demog)
sessions_demog$gender %>% unique()
sessions_demog$income %>% unique()
sessions_demog$education %>% unique()
sessions_demog$female <- ifelse(sessions_demog$gender == "m", 0, 1)
nrow(sessions_demog)
sessions_demog$ordered_income <- factor(sessions_demog$income, levels=c("lessThan5000", "to9999", "to12499", "to19999", "to24999", "to29999", "to34999", 
                                                                        "to39999", "to49999", "to59999","to74999" , "to84999", "to99999", 
                                                                        "to124999", "to149999", "to174999" , "moreThan175000"), ordered=TRUE)
sessions_demog$ordered_edu <- factor(sessions_demog$education, levels=c("highSchool", "twelfth" ,"collegeNoDegree", "associateDegree", 
                                                                        "bachelor", "master", "professional" ), ordered=TRUE)
                                     

m1<-lm(female ~   age+ income + education, data = (sessions_demog ))
summary(m1)
m1<-lm(female ~   age+ ordered_income + ordered_edu, data = (sessions_demog ))
summary(m1)

#CCES mean age
mean_age_cces <- weighted.mean(cces_2018$age, cces_2018$weight)

#Mturk mean age
sessions_demog$age <- as.numeric(sessions_demog$age)
mean_age_sessions <- mean(sessions_demog$age, na.rm = TRUE)


#In person Lab pool - Anderson et al. 
mean_age_lab <- 20.5


weighted_proportion <- function(tbl, variable,n_all=60000) {
  variable <- enquo(variable)
  tbl %>% 
    #  filter(!is.na
    #       (!!variable)) %>% 
    #add_tally(name = "n_all") %>% 
    count(!!variable, wt = weight) %>% 
    mutate(prop = n / n_all)
}


cces_2018 %>% count(gender, wt = weight) %>%
  filter(!is.na(gender)) %>%
  add_tally(name = "n_all") %>% 
  mutate(prop = n / n_all)




#CCES weighted female proportion
weighted_proportion(cces_2018, gender)$prop[2]

#CCES weighted white proportion
weighted_proportion(cces_2018, race)$prop[1]


#check that CCES has no missing values for gender
length(is.na(cces_2018$gender))

#CCES female proportion
#gender = 1 means male gender = 2 means female
unique(cces_2018$gender)
perc_female_cces <- weighted_proportion(cces_2018, gender)$prop[2]

#Mturk female proportion
perc_female_sessions <- nrow(sessions_demog[sessions_demog$gender == "f",])/nrow(sessions_demog[sessions_demog$gender == "f" | sessions_demog$gender == "m" ,])

#In person Lab pool - Anderson et al. 
perc_female_lab <- 0.61

#perc_female_se <- se_proportion(perc_female*100, nrow(sessions_demog[sessions_demog$gender == "f" | sessions_demog$gender == "m" ,]))

#check that CCES has no missing values for race
length(is.na(cces_2018$race))


#CCES race proportion
##CCES race 1= white, 2 = black, 3 = hispanic
white_perc_cces <- weighted_proportion(cces_2018, race)$prop[1]
black_perc_cces <- weighted_proportion(cces_2018, race)$prop[2]
hispanic_perc_cces <- weighted_proportion(cces_2018, race)$prop[3]


# Mturk race proportion
unique(sessions_demog$race)
white_perc_sesssions <- sum(sessions_demog$race == "white") / sum(!is.na(sessions_demog$race))
black_perc_sessions <- sum(sessions_demog$race == "black") / sum(!is.na(sessions_demog$race))
hispanic_perc_sessions <- sum(sessions_demog$race == "hispanic") / sum(!is.na(sessions_demog$race))

#In person Lab pool - Anderson et al.
white_perc_lab <- 0.77

#CCES Education
# 1 = No HS, 2 = HS graduate, 3 = some college, 4 = 2Yr, 5 = 4Yr, 6 = Post-Grad
sum(cces$educ == 5, na.rm = TRUE)

#check that CCES has no missing values for educatoin
length(is.na(cces_2018$educ))


weighted_proportion(cces_2018, educ)
#> Using `n` as weighting variable
#> # A tibble: 6 x 4
#>                       educ      n  n_all   prop
#>                  <int+lbl>  <dbl>  <dbl>  <dbl>
#> 1 1 [No HS]                 5655. 60000. 0.0942
#> 2 2 [High School Graduate] 17127. 60000. 0.285 
#> 3 3 [Some College]         13110. 60000. 0.219 
#> 4 4 [2-Year]                6169. 60000. 0.103 
#> 5 5 [4-Year]               11392. 60000. 0.190 
#> 6 6 [Post-Grad]             6547. 60000. 0.109

no_college_perc_cces <- weighted_proportion(cces_2018, educ)$prop[1] + weighted_proportion(cces_2018, educ)$prop[2]


some_college_perc_cces <- weighted_proportion(cces_2018, educ)$prop[3]


college_degree_perc_cces <- weighted_proportion(cces_2018, educ)$prop[4] + weighted_proportion(cces_2018, educ)$prop[5]


post_graduate_perc_cces <- weighted_proportion(cces_2018, educ)$prop[6]

sessions_demog$nocollege <- ifelse(sessions_demog$education == "highSchool" | sessions_demog$education == "twelfth", 1, 0)
no_college_perc_sessions <- mean(sessions_demog$nocollege, na.rm = TRUE)

sessions_demog$somecollege <- ifelse(sessions_demog$education == "collegeNoDegree", 1, 0)
some_college_perc_sessions <- mean(sessions_demog$somecollege, na.rm = TRUE)

sessions_demog$collegedegree <- ifelse(sessions_demog$education == "associateDegree" | sessions_demog$education == "bachelor", 1, 0)
college_degree_perc_sessions <- mean(sessions_demog$collegedegree, na.rm = TRUE)

sessions_demog$postgraduate <- ifelse(sessions_demog$education == "master" | sessions_demog$education == "professional", 1, 0)
post_graduate_perc_sessions <- mean(sessions_demog$postgraduate, na.rm = TRUE)


#In person Lab pool - Anderson et al.
no_college_perc_lab <- 0
some_college_perc_lab <- 1
college_degree_perc_lab <- 0
post_graduate_perc_lab <- 0

#check that CCES has no missing values for income
length(is.na(cces_2018$race))




## CCES income
cces_2018_income_subset <- cces_2018 %>% filter(faminc != "Prefer not to say")
nrow(cces_2018)
nrow(cces_2018_income_subset)

income_cces_denom <- sum(!is.na(cces_2018$faminc) & cces_2018$faminc != "Prefer not to say" & cces_2018$faminc != "Skipped")
weighted_proportion(cces_2018_income_subset, faminc,n_all=length(is.na(cces_2018_income_subset$faminc)))
income_0_10k_perc_cces <- weighted_proportion(cces_2018_income_subset, faminc,n_all=length(is.na(cces_2018_income_subset$faminc))
)$prop[1]
income_10_20k_perc_cces <- weighted_proportion(cces_2018_income_subset, faminc,n_all=length(is.na(cces_2018_income_subset$faminc)))$prop[2]
income_20_30k_perc_cces <- weighted_proportion(cces_2018_income_subset, faminc,n_all=length(is.na(cces_2018_income_subset$faminc)))$prop[3]
income_30_40k_perc_cces <- weighted_proportion(cces_2018_income_subset, faminc,n_all=length(is.na(cces_2018_income_subset$faminc)))$prop[4]
income_40_50k_perc_cces <- weighted_proportion(cces_2018_income_subset, faminc,n_all=length(is.na(cces_2018_income_subset$faminc)))$prop[5]
income_50_60k_perc_cces <- weighted_proportion(cces_2018_income_subset, faminc,n_all=length(is.na(cces_2018_income_subset$faminc)))$prop[6]
income_60k_plus_perc_cces <- weighted_proportion(cces_2018_income_subset, faminc,n_all=length(is.na(cces_2018_income_subset$faminc)))$prop[7] + weighted_proportion(cces_2018_income_subset, faminc,n_all=length(is.na(cces_2018_income_subset$faminc)))$prop[8] + weighted_proportion(cces_2018_income_subset, faminc,n_all=length(is.na(cces_2018_income_subset$faminc)))$prop[9] + weighted_proportion(cces_2018_income_subset, faminc,n_all=length(is.na(cces_2018_income_subset$faminc)))$prop[10] + weighted_proportion(cces_2018_income_subset, faminc,n_all=length(is.na(cces_2018_income_subset$faminc)))$prop[11] + weighted_proportion(cces_2018_income_subset, faminc,n_all=length(is.na(cces_2018_income_subset$faminc)))$prop[12] 

length(cces_2018$faminc)
sum(is.na(cces_2018$faminc)) # (Missing)
sum(cces_2018$faminc == "Prefer not to say", na.rm = TRUE)
sum(cces_2018$faminc == "Skipped", na.rm = TRUE)


##Sophie sessions income
unique(sessions_demog$income)

sum(sessions_demog$income == "moreThan175000")
income_sessions_denom <- sum(!is.na(sessions_demog$income))
income_0_10k_perc_sessions <- sum(sessions_demog$income == "to9999", na.rm = TRUE)/income_sessions_denom
income_10_20k_perc_sessions <- sum(sessions_demog$income == "to12499" | sessions_demog$income == "to14999" | sessions_demog$income == "to19999", na.rm = TRUE)/income_sessions_denom
income_20_30k_perc_sessions <- sum(sessions_demog$income == "to24999" | sessions_demog$income == "to29999", na.rm = TRUE)/income_sessions_denom
income_30_40k_perc_sessions <- sum(sessions_demog$income == "to34999" | sessions_demog$income == "to39999", na.rm = TRUE)/income_sessions_denom
income_40_50k_perc_sessions <- sum(sessions_demog$income == "to49999", na.rm = TRUE)/income_sessions_denom
income_50_60k_perc_sessions <- sum(sessions_demog$income == "to59999", na.rm = TRUE)/income_sessions_denom
income_60k_plus_perc_sessions <- sum(sessions_demog$income == "to74999" | sessions_demog$income == "to84999" | sessions_demog$income == "to99999" | sessions_demog$income == "to124999" | sessions_demog$income == "to149999"| sessions_demog$income == "to174999" | sessions_demog$income == "moreThan175000", na.rm = TRUE)/income_sessions_denom




unique(cces_2018$faminc)
#In person Lab pool - Anderson et al.
#question was about income of parents
income_0_10k_perc_lab <- 0.07
income_10_20k_perc_lab <- 0.00
income_20_30k_perc_lab <- 0.08
income_30_40k_perc_lab <- 0.00
income_40_50k_perc_lab <- 0.18
income_50_60k_perc_lab <- 0.00
income_60k_plus_perc_lab <- 0.67



library(RColorBrewer)

# simple bar plot for mean age
colors_race <- brewer.pal(2, "Greys")
sample_age <- c("CCES", "Online Sessions", "In-person Lab")
mean_ages <- c(mean_age_cces, mean_age_sessions, mean_age_lab)

png(file = "demographics_age_grey.png")
par(xpd = T, mar = par()$mar + c(0,0,0,5))
barplot(mean_ages, main = "Age", names.arg = sample_age,ylim=c(0,60), xlab = "Sample", ylab = "Mean Age", col = colors_race[2])

par(mar=c(5, 4, 4, 2) + 0.1)
# Save the file
dev.off()




#simple bar plot for white percentages
colors_race <- brewer.pal(2, "Greys")
sample_race <- c("CCES", "Online Sessions", "In-person Lab")
race_labels <- c("White", "Non-white")
proportions_race <- matrix(c(white_perc_cces, white_perc_sesssions, white_perc_lab, 1-white_perc_cces, 1-white_perc_sesssions, 1-white_perc_lab), nrow = 2, ncol = 3, byrow = TRUE)


# Give the chart file a name
png(file = "demographics_race_grey.png")
par(xpd = T, mar = par()$mar + c(0,0,0,5))

# Create the bar chart
barplot(proportions_race, main = "Race", names.arg = sample_race, xlab = "Sample", ylab = "Race Proportions", col = colors_race)

# Add the legend to the chart

legend(3.9, 0.6,race_labels, cex = 0.5, fill = colors_race)
par(mar=c(5, 4, 4, 2) + 0.1)
# Save the file
dev.off()



#simple bar plot for female percentages
colors_female <- brewer.pal(2, "Greys")
sample_female <- c("CCES", "Online Sessions", "In-person Lab")
gender_labels <- c("Female", "Male")
proportions_gender <- matrix(c(perc_female_cces, perc_female_sessions, perc_female_lab, 1-perc_female_cces, 1-perc_female_sessions, 1-perc_female_lab), nrow = 2, ncol = 3, byrow = TRUE)


# Give the chart file a name
png(file = "demographics_gender_grey.png")
par(xpd = T, mar = par()$mar + c(0,0,0,5))

# Create the bar chart
barplot(proportions_gender, main = "Gender", names.arg = sample_female, xlab = "Sample", ylab = "Gender Proportions", col = colors_female)

# Add the legend to the chart

legend(3.9, 0.6,gender_labels, cex = 0.5, fill = colors_female)
par(mar=c(5, 4, 4, 2) + 0.1)
# Save the file
dev.off()



#stacked bar plot for education levels
colors_edu <- brewer.pal(4, "Greys")
sample_edu <- c("CCES", "Online Sessions", "In-person Lab")
education_levels <- c("No College", "Some College", "College Degree", "Post Graduate Degree")
proportions_edu <- matrix(c(no_college_perc_cces, no_college_perc_sessions, no_college_perc_lab, some_college_perc_cces, some_college_perc_sessions, some_college_perc_lab, college_degree_perc_cces, college_degree_perc_sessions, college_degree_perc_lab, post_graduate_perc_cces, post_graduate_perc_sessions, post_graduate_perc_lab), nrow = 4, ncol = 3, byrow = TRUE)



# Give the chart file a name
png(file = "demographics_edu_grey.png")
par(xpd = T, mar = par()$mar + c(0,0,0,5))

# Create the bar chart
barplot(proportions_edu, main = "Education Levels", names.arg = sample_edu, xlab = "Sample", ylab = "Education Level Proportion", col = colors_edu)

# Add the legend to the chart

legend(3.9, 0.6,education_levels, cex = 0.5, fill = colors_edu)
par(mar=c(5, 4, 4, 2) + 0.1)
# Save the file
dev.off()


#stacked bar plot for income levels
colors_inc <- brewer.pal(7, "Greys")
sample_inc <- c("CCES", "Online Sessions", "In-person Lab")
income_levels <- c("$0-$10,000", "$10,000 - $20,000", "$20,000 - $30,000", "$30,000 - $40,000", "$40,000 - $50,000", "$50,000 - $60,000", "$60,000 +")
proportions_inc <- matrix(c(income_0_10k_perc_cces, income_0_10k_perc_sessions, income_0_10k_perc_lab, income_10_20k_perc_cces, income_10_20k_perc_sessions, income_10_20k_perc_lab, income_20_30k_perc_cces, income_20_30k_perc_sessions, income_20_30k_perc_lab, income_30_40k_perc_cces, income_30_40k_perc_sessions, income_30_40k_perc_lab, income_40_50k_perc_cces, income_40_50k_perc_sessions, income_40_50k_perc_lab, income_50_60k_perc_cces, income_50_60k_perc_sessions, income_50_60k_perc_lab, income_60k_plus_perc_cces, income_60k_plus_perc_sessions, income_60k_plus_perc_lab), nrow = 7, ncol = 3, byrow = TRUE)


# Give the chart file a name
png(file = "demographics_income_grey.png")

# Create the bar chart
par(xpd = T, mar = par()$mar + c(0,0,0,5))
barplot(proportions_inc, main = "Income Levels", names.arg = sample_inc, xlab = "Sample", ylab = "Income Level Proportion", col = colors_inc)
# Add the legend to the chart
legend(3.9, 0.6,income_levels, cex = 0.5, fill = colors_inc)
par(mar=c(5, 4, 4, 2) + 0.1)
# Save the file
dev.off()


